# Short-name entry: `id` names the versioned spec defined just below.
mmlu-abstract-algebra:
  id: mmlu-abstract-algebra.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=abstract_algebra, split=validation.
mmlu-abstract-algebra.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=abstract_algebra&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-anatomy:
  id: mmlu-anatomy.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=anatomy, split=validation.
mmlu-anatomy.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=anatomy&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-astronomy:
  id: mmlu-astronomy.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=astronomy, split=validation.
mmlu-astronomy.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=astronomy&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-business-ethics:
  id: mmlu-business-ethics.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=business_ethics, split=validation.
mmlu-business-ethics.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=business_ethics&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-clinical-knowledge:
  id: mmlu-clinical-knowledge.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=clinical_knowledge, split=validation.
mmlu-clinical-knowledge.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=clinical_knowledge&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-college-biology:
  id: mmlu-college-biology.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=college_biology, split=validation.
mmlu-college-biology.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=college_biology&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-college-chemistry:
  id: mmlu-college-chemistry.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=college_chemistry, split=validation.
mmlu-college-chemistry.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=college_chemistry&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-college-computer-science:
  id: mmlu-college-computer-science.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=college_computer_science, split=validation.
mmlu-college-computer-science.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=college_computer_science&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-college-mathematics:
  id: mmlu-college-mathematics.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=college_mathematics, split=validation.
mmlu-college-mathematics.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=college_mathematics&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-college-medicine:
  id: mmlu-college-medicine.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=college_medicine, split=validation.
mmlu-college-medicine.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=college_medicine&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-college-physics:
  id: mmlu-college-physics.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=college_physics, split=validation.
mmlu-college-physics.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=college_physics&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-computer-security:
  id: mmlu-computer-security.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=computer_security, split=validation.
mmlu-computer-security.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=computer_security&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-conceptual-physics:
  id: mmlu-conceptual-physics.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=conceptual_physics, split=validation.
mmlu-conceptual-physics.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=conceptual_physics&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-econometrics:
  id: mmlu-econometrics.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=econometrics, split=validation.
mmlu-econometrics.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=econometrics&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-electrical-engineering:
  id: mmlu-electrical-engineering.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=electrical_engineering, split=validation.
mmlu-electrical-engineering.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=electrical_engineering&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-elementary-mathematics:
  id: mmlu-elementary-mathematics.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=elementary_mathematics, split=validation.
mmlu-elementary-mathematics.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=elementary_mathematics&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-formal-logic:
  id: mmlu-formal-logic.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=formal_logic, split=validation.
mmlu-formal-logic.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=formal_logic&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-global-facts:
  id: mmlu-global-facts.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=global_facts, split=validation.
mmlu-global-facts.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=global_facts&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-biology:
  id: mmlu-high-school-biology.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_biology, split=validation.
mmlu-high-school-biology.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_biology&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-chemistry:
  id: mmlu-high-school-chemistry.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_chemistry, split=validation.
mmlu-high-school-chemistry.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_chemistry&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-computer-science:
  id: mmlu-high-school-computer-science.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_computer_science, split=validation.
mmlu-high-school-computer-science.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_computer_science&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-european-history:
  id: mmlu-high-school-european-history.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_european_history, split=validation.
mmlu-high-school-european-history.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_european_history&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-geography:
  id: mmlu-high-school-geography.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_geography, split=validation.
mmlu-high-school-geography.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_geography&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-government-and-politics:
  id: mmlu-high-school-government-and-politics.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_government_and_politics,
# split=validation.
mmlu-high-school-government-and-politics.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_government_and_politics&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-macroeconomics:
  id: mmlu-high-school-macroeconomics.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_macroeconomics, split=validation.
mmlu-high-school-macroeconomics.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_macroeconomics&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-mathematics:
  id: mmlu-high-school-mathematics.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_mathematics, split=validation.
mmlu-high-school-mathematics.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_mathematics&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-microeconomics:
  id: mmlu-high-school-microeconomics.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_microeconomics, split=validation.
mmlu-high-school-microeconomics.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_microeconomics&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-physics:
  id: mmlu-high-school-physics.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_physics, split=validation.
mmlu-high-school-physics.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_physics&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-psychology:
  id: mmlu-high-school-psychology.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_psychology, split=validation.
mmlu-high-school-psychology.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_psychology&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-statistics:
  id: mmlu-high-school-statistics.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_statistics, split=validation.
mmlu-high-school-statistics.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_statistics&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-us-history:
  id: mmlu-high-school-us-history.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_us_history, split=validation.
mmlu-high-school-us-history.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_us_history&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-high-school-world-history:
  id: mmlu-high-school-world-history.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=high_school_world_history, split=validation.
mmlu-high-school-world-history.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=high_school_world_history&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-human-aging:
  id: mmlu-human-aging.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=human_aging, split=validation.
mmlu-human-aging.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=human_aging&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-human-sexuality:
  id: mmlu-human-sexuality.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=human_sexuality, split=validation.
mmlu-human-sexuality.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=human_sexuality&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-international-law:
  id: mmlu-international-law.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=international_law, split=validation.
mmlu-international-law.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=international_law&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-jurisprudence:
  id: mmlu-jurisprudence.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=jurisprudence, split=validation.
mmlu-jurisprudence.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=jurisprudence&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-logical-fallacies:
  id: mmlu-logical-fallacies.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=logical_fallacies, split=validation.
mmlu-logical-fallacies.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=logical_fallacies&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-machine-learning:
  id: mmlu-machine-learning.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=machine_learning, split=validation.
mmlu-machine-learning.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=machine_learning&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-management:
  id: mmlu-management.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=management, split=validation.
mmlu-management.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=management&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-marketing:
  id: mmlu-marketing.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=marketing, split=validation.
mmlu-marketing.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=marketing&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-medical-genetics:
  id: mmlu-medical-genetics.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=medical_genetics, split=validation.
mmlu-medical-genetics.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=medical_genetics&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-miscellaneous:
  id: mmlu-miscellaneous.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=miscellaneous, split=validation.
mmlu-miscellaneous.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=miscellaneous&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-moral-disputes:
  id: mmlu-moral-disputes.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=moral_disputes, split=validation.
mmlu-moral-disputes.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=moral_disputes&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-moral-scenarios:
  id: mmlu-moral-scenarios.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=moral_scenarios, split=validation.
mmlu-moral-scenarios.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=moral_scenarios&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-nutrition:
  id: mmlu-nutrition.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=nutrition, split=validation.
mmlu-nutrition.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=nutrition&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-philosophy:
  id: mmlu-philosophy.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=philosophy, split=validation.
mmlu-philosophy.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=philosophy&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-prehistory:
  id: mmlu-prehistory.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=prehistory, split=validation.
mmlu-prehistory.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=prehistory&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-professional-accounting:
  id: mmlu-professional-accounting.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=professional_accounting, split=validation.
mmlu-professional-accounting.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=professional_accounting&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-professional-law:
  id: mmlu-professional-law.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=professional_law, split=validation.
mmlu-professional-law.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=professional_law&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-professional-medicine:
  id: mmlu-professional-medicine.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=professional_medicine, split=validation.
mmlu-professional-medicine.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=professional_medicine&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-professional-psychology:
  id: mmlu-professional-psychology.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=professional_psychology, split=validation.
mmlu-professional-psychology.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=professional_psychology&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-public-relations:
  id: mmlu-public-relations.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=public_relations, split=validation.
mmlu-public-relations.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=public_relations&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-security-studies:
  id: mmlu-security-studies.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=security_studies, split=validation.
mmlu-security-studies.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=security_studies&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-sociology:
  id: mmlu-sociology.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=sociology, split=validation.
mmlu-sociology.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=sociology&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-us-foreign-policy:
  id: mmlu-us-foreign-policy.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=us_foreign_policy, split=validation.
mmlu-us-foreign-policy.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=us_foreign_policy&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-virology:
  id: mmlu-virology.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=virology, split=validation.
mmlu-virology.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=virology&split=validation'

# Short-name entry: `id` names the versioned spec defined just below.
mmlu-world-religions:
  id: mmlu-world-religions.val.ab-v1
  metrics:
    - accuracy
# Spec: dataset query uses name=world_religions, split=validation.
mmlu-world-religions.val.ab-v1:
  class: evals.elsuite.multiple_choice:MultipleChoice
  args:
    dataset: 'hf://hendrycks_test?name=world_religions&split=validation'
